##########################################################################
# Project:    Commission communication
# Task:       Scrape links to EC press releases (after the website revamp)
# Author:     Christian Rauh (25.07.2022)
##########################################################################

# Note - this is an indicative file on how to scrape Commission press releases from
# the current (25.07.2022) online archive for those who want to extend the published corpus

# Raw HTMLs and procedures to transfer them into the published corpus are too large 
# for the replication and can be requested from @ChRauh


# Background
# Search results for Comm PRs between January 17 1985 and January 8 2021 can be seen here:
# https://ec.europa.eu/commission/presscorner/advancedsearch/en?keywords=&dotyp=1&parea=&pareaType=&datepickerbefore=8%20January%202021&datebefore=Fri%20Jan%2008%202021%2000:00:00%20GMT%2B0100%20(Mitteleurop%C3%A4ische%20Normalzeit)&commissioner=&datepickerafter=17%20January%201985&dateafter=Thu%20Jan%2017%201985%2000:00:00%20GMT%2B0100%20(Mitteleurop%C3%A4ische%20Normalzeit)&pagenumber=1
# JS rendered results page with 10 hits each, as of today 46509 results (=4651 search results pages)
# Approach: headless browsing to render page, then extract links to individual PRs

# Packages ####
library(tidyverse) # 1.3.0
library(rvest) # 0.3.6
library(lubridate) # 1.7.9


# Harvest links to IPs ####

# Collect the links of every search results page in a pre-allocated list and
# bind them once after the loop; growing a data frame with rbind() on each of
# the several thousand iterations copies the whole object every time
n_pages <- 4651 # Number of search result pages (10 hits each)
search_url <- "https://ec.europa.eu/commission/presscorner/advancedsearch/en?keywords=&dotyp=1&parea=&pareaType=&datepickerbefore=8%20January%202021&datebefore=Fri%20Jan%2008%202021%2000:00:00%20GMT%2B0100%20(Mitteleurop%C3%A4ische%20Normalzeit)&commissioner=&datepickerafter=17%20January%201985&dateafter=Thu%20Jan%2017%201985%2000:00:00%20GMT%2B0100%20(Mitteleurop%C3%A4ische%20Normalzeit)&pagenumber="
page_links <- vector("list", n_pages)
failed_pages <- integer(0) # Pages that could not be rendered, for a later re-run

for (i in seq_len(n_pages)) { # Loop over number of search result pages

  # Show progress
  print(i)
  print(round((i / n_pages) * 100, 2))

  # Construct search page URL
  url <- paste0(search_url, i)

  # Export a PhantomJS script that renders the current URL with JS content
  # The script waits 5000 ms (5 secs) after opening the page before dumping
  # the source, in the hope that the page is rendered completely by then
  writeLines(sprintf("var page = require('webpage').create();
  page.open('%s', function () {
    setTimeout(function(){
	console.log(page.content); //page source
    phantom.exit();
	}   , 5000);
                     });", url), con = "scrape.js")

  # Invoke Phantom JS to execute the current script
  # and store the rendered html code into a text object
  rendered <- system("phantomjs scrape.js", intern = TRUE)

  # A non-zero exit status is reported via the "status" attribute; an empty
  # result would make read_html() fail and abort the whole loop.
  # Skip such pages, but keep track of them so they can be re-scraped
  status <- attr(rendered, "status")
  if (length(rendered) == 0 || (!is.null(status) && status != 0)) {
    warning("Rendering failed for search results page ", i, call. = FALSE)
    failed_pages <- c(failed_pages, i)
    next
  }

  # Store rendered HTML to disc
  writeLines(rendered, con = "CurrentRender.html") # export to html file

  # Parse html
  page <- read_html("CurrentRender.html")

  # Get titles of IPs on search page
  # Breaks for individual headings, disregarded for now
  # titles <- page %>%
  #   html_nodes(".ecl-list-item__body") %>%
  #   html_nodes(".ecl-heading") %>%
  #   html_text() %>%
  #   as.data.frame() %>%
  #   rename(headline = 1)

  # Get links to IPs
  # Anchors without an href yield NA, which filter() drops
  links <- page %>%
    html_nodes("a") %>%
    html_attr("href") %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    rename(link = 1) %>%
    filter(str_detect(link, "en/ip")) # Only those links pointing to an english IP (searched with english language setting!)

  # Get pub dates of IPs
  # dates <- page %>%
  #   html_nodes(".ecl-list-item__body") %>%
  #   html_nodes(".ecl-meta__item") %>%
  #   html_text() %>%
  #   as.data.frame() %>%
  #   rename(date = 1) %>%
  #   filter(!str_detect(date, "Press release")) # Not type meta item

  # Combine harvested info
  # res <- as.d(dates, titles, links)

  # Store the links of the current page
  page_links[[i]] <- links

}

# Write to target DF (entries of skipped pages are NULL and are ignored)
urls <- bind_rows(page_links)

# Complete the links
# The harvested hrefs are prefixed with the press corner root (assumes they are
# relative to it - verify against the rendered search pages).
# Only do so if links were harvested: paste0() recycles a zero-length vector
# to "", and the resulting length-one string cannot be assigned to a data
# frame without rows
if (nrow(urls) > 0) {
  urls$link <- paste0("https://ec.europa.eu/commission/presscorner/", urls$link)
}

# Clean the date
# urls$date <- dmy(urls$date) %>% as.character()

# Export url list
# Create the target folder first, so that the harvested links are not lost
# to a failing write when the folder is missing
dir.create("./Data", showWarnings = FALSE, recursive = TRUE)
write_rds(urls, "./Data/Raw_IP_links.Rds")



# Download press releases ####

# N.B.: Takes a long time and requires much local memory
# Proceed with caution

# Also JS rendering is necessary here

# Reload the harvested links and derive all helper columns in one pass
#   link: harvested URL with any whitespace stripped (errors in the source)
#   file: local target file name, following conventions of my original scraper
#         (upper-cased IP identifier, stored in ./PressReleasesRAW/)
#   test: TRUE if the link ends in an identifier of the form ip_NN_N to
#         ip_NN_NNNN, i.e. flags links with errors in the source
urls <- read_rds("./Data/Raw_IP_links.Rds") %>%
  mutate(
    link = str_remove_all(link, "\\s"),
    file = paste0(
      "./PressReleasesRAW/",
      toupper(str_extract(link, "ip.*?$")),
      "_en.htm"
    ),
    test = str_detect(link, "ip_[0-9]{2}_[0-9]{1,4}$")
  )
# urls$link[758]


# Loop over links and DL
# Renders every press release with PhantomJS and stores the resulting HTML
# under urls$file. Files that already exist are skipped, so the loop can be
# interrupted and re-run.

# Make sure the target folder(s) exist, otherwise every write below fails
for (target_dir in unique(dirname(urls$file))) {
  dir.create(target_dir, showWarnings = FALSE, recursive = TRUE)
}

n_links <- nrow(urls)

for (i in seq_len(n_links)) { # seq_len() does not iterate if there are no links

  # Show progress
  print(round(i / n_links * 100, digits = 2))
  print(urls$link[i])
  print(i)

  # Check whether file is already available
  if (file.exists(urls$file[i])) {
    next
  }

  # Export a PhantomJS script that renders the current URL with JS content
  # The script waits 5000 ms (5 secs) after opening the page before dumping
  # the source, in the hope that the page is rendered completely by then
  writeLines(sprintf("var page = require('webpage').create();
  page.open('%s', function () {
    setTimeout(function(){
	console.log(page.content); //page source
    phantom.exit();
	}   , 5000);
                     });", urls$link[i]), con = "scrape.js")

  # Invoke Phantom JS to execute the current script
  # and store the rendered html code into a text object
  rendered <- system("phantomjs scrape.js", intern = TRUE)

  # Do not store failed renders: an empty file would pass the file.exists()
  # check above on the next run and the press release would never be retried
  status <- attr(rendered, "status")
  if (length(rendered) == 0 || (!is.null(status) && status != 0)) {
    warning("Rendering failed for ", urls$link[i], call. = FALSE)
    next
  }

  # Store rendered HTML to disc
  writeLines(rendered, con = urls$file[i]) # export to html file

}

# Cross-check
# Flags which target files are present; FALSE marks press releases that
# still need to be downloaded
urls$exist <- file.exists(urls$file)
message(sum(!urls$exist), " of ", n_links, " press releases are still missing")
